library(tidyverse)
library(gt)
library(kableExtra)
library(DT)
names_a <- read_csv(here::here("Week9", "Lab", "StateNames_A.csv"))
names_a <- names_a |>
rename(Sex = Gender)Challenge 9
Revision
My interactive table was grouped, when it should have been ungrouped to allow for more searching.
Setup
3. Summarizing & Visualizing the Number of Allisons
1.
allison_data <- names_a |>
filter(Name == "Allison") |>
select(-Year) |>
group_by(State, Sex) |>
summarize(n = sum(Count)) |>
pivot_wider(names_from = Sex,
values_from = n,
values_fill = 0) |>
rename(Male = M, Female = F)
allison_data |>
head(n = 15) |>
knitr::kable(
caption = "Number of Babies Named \"Allison\" by State and Sex (head=15)"
) |>
kable_styling(font_size = 15)| State | Female | Male |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
DT::datatable(names_a)2.
allison_f <- names_a |>
filter(Name == "Allison",
Sex == "F") |>
select(-State) |>
group_by(Year) |>
summarize(n = sum(Count))
allison_f |>
head(n = 15) |>
knitr::kable(
caption = "Number of Babies Named \"Allison\" with Female Sex by Year (head=15)"
) |>
kable_styling(font_size = 15)| Year | n |
|---|---|
| 1997 | 7274 |
| 1998 | 7861 |
| 1999 | 7023 |
| 2000 | 6314 |
| 2001 | 6209 |
| 2002 | 6237 |
| 2003 | 5850 |
| 2004 | 5871 |
| 2005 | 5631 |
| 2006 | 5560 |
| 2007 | 5450 |
| 2008 | 6237 |
| 2009 | 6579 |
| 2010 | 5856 |
| 2011 | 5453 |
3.
allison_f |>
ggplot(mapping = aes(x = Year, y = n)) +
geom_line() +
geom_point() +
labs(y = NULL,
title = "Number of Babies With The Name Allison By Year"
)4. Modeling the Number of Allisons
- Fit a linear model with the year as the explanatory variable, and the number of Allisons as the response. Similar to #3, each year should have one observation–the total number of Allisons born that year.
allison_lm <- lm(n ~ Year, data = allison_f)Write out the estimated regression equation.
n = 209689.8 - 101*Year
- Plot the residuals of the model, that is, the actual values minus the predicted values. Comment on the residuals - do you see any patterns?
allison_lm |>
broom::augment() |>
ggplot(mapping = aes(y = .resid, x = .fitted)) +
geom_point() +
labs(x = "Fitted",
y = "Residuals",
title = "Residuals of the Allison Model"
)The residuals start tight and spread out over time on the y-axis, so at the end they are very spread out. They almost seem to trend downward too.
- From the spread of this model, we can see the name is very varied in popularity, but does seem to be more unpopular now
5. Spelling by State
8.
alan_spelling <- names_a |>
filter(Name %in% c("Allan", "Alan", "Allen")) |>
group_by(Name, Year) |>
summarize(n = sum(Count))
alan_spelling %>%
head(n = 15) %>%
knitr::kable(
caption = "Number of Babies Named Different Allen Spellings by Year (head=15)",
booktabs = TRUE
) %>%
kable_styling(font_size = 10,
latex_options = "striped",
)| Name | Year | n |
|---|---|---|
| Alan | 1997 | 2155 |
| Alan | 1998 | 2102 |
| Alan | 1999 | 2220 |
| Alan | 2000 | 2398 |
| Alan | 2001 | 2622 |
| Alan | 2002 | 2591 |
| Alan | 2003 | 3083 |
| Alan | 2004 | 3051 |
| Alan | 2005 | 3189 |
| Alan | 2006 | 3442 |
| Alan | 2007 | 3225 |
| Alan | 2008 | 3002 |
| Alan | 2009 | 2817 |
| Alan | 2010 | 2487 |
| Alan | 2011 | 2321 |
alan_spelling |>
ggplot(mapping = aes(x = Year, y = n, color = Name)) +
geom_line() +
geom_point() +
labs(
y = NULL,
title = "Number of Babies with Different Spellings of Allen by Year"
)9.
allan_state <- names_a |>
filter(Name %in% c("Allan", "Alan", "Allen"),
Year == 2000,
Sex == "M",
State %in% c("CA", "PA")) |>
select(-Sex, -Year) |>
group_by(Name, State)
allan_state_wide <- allan_state |>
pivot_wider(names_from = Name,
values_from = Count,
values_fill = 0)
allan_state_wide |>
knitr::kable(
caption = "Babies Born with Differing Spellings of Alan by State"
)| State | Alan | Allen | Allan |
|---|---|---|---|
| CA | 579 | 176 | 131 |
| PA | 51 | 56 | 12 |
allan_percent <- allan_state |>
group_by(State) |>
mutate(prop = Count / sum(Count)) |>
select(-Count) |>
pivot_wider(names_from = Name,
values_from = prop,
values_fill = 0)
# Why is it moving the state column???????
allan_percent %>%
gt() %>%
fmt_percent(columns = Alan:Allan, decimals = 2)| Alan | Allen | Allan |
|---|---|---|
| CA | ||
| 65.35% | 19.86% | 14.79% |
| PA | ||
| 42.86% | 47.06% | 10.08% |